This pipeline performs reproducible microbiome sequence analysis using DADA2 for sequence processing and phyloseq for data management and visualization. The pipeline supports both bacterial and fungal amplicon sequencing data.
# Load required packages
library(dada2)
library(phyloseq)
library(ggplot2)
library(yaml)
library(dplyr)
library(tidyr)
# Set seed for reproducibility
set.seed(123)
# Load configuration
config <- yaml::read_yaml("config.yaml")
analysis_type <- config$analysis_type
paste("Analysis type:", analysis_type, "\n")## [1] "Analysis type: bacteria \n"
# Define paths
path <- "data/raw" # Path to raw fastq files
output_path <- "output"
fig_path <- file.path(output_path, "figures")
table_path <- file.path(output_path, "tables")
# Create output directories if they don't exist
dir.create(fig_path, showWarnings = FALSE, recursive = TRUE)
dir.create(table_path, showWarnings = FALSE, recursive = TRUE)
# List files
fnFs <- sort(list.files(path, pattern = "_R1_001.fastq", full.names = TRUE))
fnRs <- sort(list.files(path, pattern = "_R2_001.fastq", full.names = TRUE))
# Extract sample names
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1) # this is the hardest custom line for a new programmer (or at least it was for me when I was learning)
print(paste0("Found", length(fnFs), "forward reads and", length(fnRs), "reverse reads\n"))## [1] "Found19forward reads and19reverse reads\n"
## [1] "Sample names:F3D0, F3D1, F3D141, F3D142, F3D143, F3D144, F3D145, F3D146, F3D147, F3D148, F3D149, F3D150, F3D2, F3D3, F3D5, F3D6, F3D7, F3D8, F3D9\n"
if (length(fnFs) > 0) {
# Plot quality profiles for forward reads
plotQualityProfile(fnFs[1:min(2, length(fnFs))]) +
ggtitle("Forward Read Quality Profile")
# Plot quality profiles for reverse reads
plotQualityProfile(fnRs[1:min(2, length(fnRs))]) +
ggtitle("Reverse Read Quality Profile")
}# Place filtered files in filtered/ subdirectory
filtFs <- file.path(path, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(path, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
names(filtFs) <- sample.names
names(filtRs) <- sample.names
# Filter and trim
if (length(fnFs) > 0) {
out <- filterAndTrim(
fnFs, filtFs, fnRs, filtRs,
truncLen = config$dada2$filter$truncLen,
maxN = config$dada2$filter$maxN,
maxEE = config$dada2$filter$maxEE,
truncQ = config$dada2$filter$truncQ,
rm.phix = config$dada2$filter$rm.phix,
compress = config$dada2$filter$compress,
multithread = TRUE
)
print(head(out))
# Save filtering statistics
if (config$output$save_intermediate) {
write.csv(out, file.path(table_path, "filtering_stats.csv"))
}
}## reads.in reads.out
## F3D0_S188_L001_R1_001.fastq 7793 7113
## F3D1_S189_L001_R1_001.fastq 5869 5299
## F3D141_S207_L001_R1_001.fastq 5958 5463
## F3D142_S208_L001_R1_001.fastq 3183 2914
## F3D143_S209_L001_R1_001.fastq 3178 2941
## F3D144_S210_L001_R1_001.fastq 4827 4312
if (length(fnFs) > 0) {
# Learn forward read errors
errF <- learnErrors(filtFs,
nbases = config$dada2$learn_errors$nbases,
multithread = TRUE)
# Learn reverse read errors
errR <- learnErrors(filtRs,
nbases = config$dada2$learn_errors$nbases,
multithread = TRUE)
# Plot error rates
plotErrors(errF, nominalQ = TRUE) +
ggtitle("Forward Read Error Rates")
plotErrors(errR, nominalQ = TRUE) +
ggtitle("Reverse Read Error Rates")
}## 2978880 total bases in 12412 reads from 2 samples will be used for learning the error rates.
## 2860000 total bases in 17875 reads from 3 samples will be used for learning the error rates.
if (length(fnFs) > 0) {
# Apply core DADA2 sample inference algorithm
dadaFs <- dada(filtFs, err = errF, multithread = TRUE)
dadaRs <- dada(filtRs, err = errR, multithread = TRUE)
# Inspect the returned dada-class object
cat("DADA2 inference on first sample:\n")
print(dadaFs[[1]])
}## Sample 1 - 7113 reads in 1979 unique sequences.
## Sample 2 - 5299 reads in 1639 unique sequences.
## Sample 3 - 5463 reads in 1477 unique sequences.
## Sample 4 - 2914 reads in 904 unique sequences.
## Sample 5 - 2941 reads in 939 unique sequences.
## Sample 6 - 4312 reads in 1267 unique sequences.
## Sample 7 - 6741 reads in 1756 unique sequences.
## Sample 8 - 4560 reads in 1438 unique sequences.
## Sample 9 - 15637 reads in 3590 unique sequences.
## Sample 10 - 11413 reads in 2762 unique sequences.
## Sample 11 - 12017 reads in 3021 unique sequences.
## Sample 12 - 5032 reads in 1566 unique sequences.
## Sample 13 - 18075 reads in 3707 unique sequences.
## Sample 14 - 6250 reads in 1479 unique sequences.
## Sample 15 - 4052 reads in 1195 unique sequences.
## Sample 16 - 7369 reads in 1832 unique sequences.
## Sample 17 - 4765 reads in 1183 unique sequences.
## Sample 18 - 4871 reads in 1382 unique sequences.
## Sample 19 - 6504 reads in 1709 unique sequences.
## Sample 1 - 7113 reads in 1660 unique sequences.
## Sample 2 - 5299 reads in 1349 unique sequences.
## Sample 3 - 5463 reads in 1335 unique sequences.
## Sample 4 - 2914 reads in 853 unique sequences.
## Sample 5 - 2941 reads in 880 unique sequences.
## Sample 6 - 4312 reads in 1286 unique sequences.
## Sample 7 - 6741 reads in 1803 unique sequences.
## Sample 8 - 4560 reads in 1265 unique sequences.
## Sample 9 - 15637 reads in 3414 unique sequences.
## Sample 10 - 11413 reads in 2522 unique sequences.
## Sample 11 - 12017 reads in 2771 unique sequences.
## Sample 12 - 5032 reads in 1415 unique sequences.
## Sample 13 - 18075 reads in 3290 unique sequences.
## Sample 14 - 6250 reads in 1390 unique sequences.
## Sample 15 - 4052 reads in 1134 unique sequences.
## Sample 16 - 7369 reads in 1635 unique sequences.
## Sample 17 - 4765 reads in 1084 unique sequences.
## Sample 18 - 4871 reads in 1161 unique sequences.
## Sample 19 - 6504 reads in 1502 unique sequences.
## DADA2 inference on first sample:
## dada-class: object describing DADA2 denoising results
## 128 sequence variants were inferred from 1979 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
if (length(fnFs) > 0) {
# Merge paired end reads
mergers <- mergePairs(
dadaFs, filtFs,
dadaRs, filtRs,
minOverlap = config$dada2$merge$minOverlap,
maxMismatch = config$dada2$merge$maxMismatch,
verbose = TRUE
)
# Inspect the merger data.frame from the first sample
cat("Merged reads for first sample:\n")
print(head(mergers[[1]]))
}## Merged reads for first sample:
## sequence
## 1 TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGAAGATCAAGTCAGCGGTAAAATTGAGAGGCTCAACCTCTTCGAGCCGTTGAAACTGGTTTTCTTGAGTGAGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCATACCGGCGCTCAACTGACGCTCATGCACGAAAGTGTGGGTATCGAACAGG
## 2 TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCCTGCCAAGTCAGCGGTAAAATTGCGGGGCTCAACCCCGTACAGCCGTTGAAACTGCCGGGCTCGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCATACCGGCGCCCTACTGACGCTGAGGCACGAAAGTGCGGGGATCAAACAGG
## 3 TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGCTGTTAAGTCAGCGGTCAAATGTCGGGGCTCAACCCCGGCCTGCCGTTGAAACTGGCGGCCTCGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCATACCGGCGCCCGACTGACGCTGAGGCACGAAAGCGTGGGTATCGAACAGG
## 4 TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGCTTTTAAGTCAGCGGTAAAAATTCGGGGCTCAACCCCGTCCGGCCGTTGAAACTGGGGGCCTTGAGTGGGCGAGAAGAAGGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCCTTCCGGCGCCCTACTGACGCTGAGGCACGAAAGTGCGGGGATCGAACAGG
## 5 TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGACTCTCAAGTCAGCGGTCAAATCGCGGGGCTCAACCCCGTTCCGCCGTTGAAACTGGGAGCCTTGAGTGCGCGAGAAGTAGGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCCTACCGGCGCGCAACTGACGCTCATGCACGAAAGCGTGGGTATCGAACAGG
## 6 TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGATGCCAAGTCAGCGGTAAAAAAGCGGTGCTCAACGCCGTCGAGCCGTTGAAACTGGCGTTCTTGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCATACCGGCGCCCTACTGACGCTGAGGCACGAAAGCGTGGGTATCGAACAGG
## abundance forward reverse nmatch nmismatch nindel prefer accept
## 1 579 1 1 148 0 0 1 TRUE
## 2 470 2 2 148 0 0 2 TRUE
## 3 449 3 4 148 0 0 1 TRUE
## 4 430 4 3 148 0 0 2 TRUE
## 5 345 5 6 148 0 0 1 TRUE
## 6 282 6 5 148 0 0 2 TRUE
if (length(fnFs) > 0) {
# Construct sequence table
seqtab <- makeSequenceTable(mergers)
cat("Sequence table dimensions:", dim(seqtab), "\n")
cat("Sequence length distribution:\n")
print(table(nchar(getSequences(seqtab))))
}## Sequence table dimensions: 19 277
## Sequence length distribution:
##
## 251 252 253 254 255
## 1 85 184 5 2
if (length(fnFs) > 0) {
# Remove chimeric sequences
seqtab.nochim <- removeBimeraDenovo(
seqtab,
method = config$dada2$chimera$method,
multithread = TRUE,
verbose = TRUE
)
cat("Dimensions after chimera removal:", dim(seqtab.nochim), "\n")
cat("Proportion of non-chimeric sequences:",
sum(seqtab.nochim) / sum(seqtab), "\n")
}## Dimensions after chimera removal: 19 217
## Proportion of non-chimeric sequences: 0.9626902
if (length(fnFs) > 0) {
# Create function to get number of unique sequences
getN <- function(x) sum(getUniques(x))
# Build tracking table
track <- cbind(
out,
sapply(dadaFs, getN),
sapply(dadaRs, getN),
sapply(mergers, getN),
rowSums(seqtab.nochim)
)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR",
"merged", "nonchim")
rownames(track) <- sample.names
print(track)
# Save tracking table
write.csv(track, file.path(table_path, "read_tracking.csv"))
# Visualize read tracking
track_df <- as.data.frame(track)
track_df$sample <- rownames(track_df)
track_long <- tidyr::pivot_longer(
track_df,
cols = -sample,
names_to = "step",
values_to = "reads"
)
track_long$step <- factor(
track_long$step,
levels = c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
)
ggplot(track_long, aes(x = step, y = reads, group = sample, color = sample)) +
geom_line() +
geom_point() +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Read Tracking Through Pipeline",
x = "Pipeline Step",
y = "Number of Reads")
ggsave(file.path(fig_path, "read_tracking.png"),
width = 10, height = 6, dpi = 300)
}## input filtered denoisedF denoisedR merged nonchim
## F3D0 7793 7113 6999 7004 6577 6565
## F3D1 5869 5299 5227 5240 5024 5013
## F3D141 5958 5463 5339 5344 4960 4837
## F3D142 3183 2914 2796 2833 2597 2523
## F3D143 3178 2941 2823 2862 2553 2519
## F3D144 4827 4312 4175 4225 3625 3486
## F3D145 7377 6741 6594 6623 6072 5813
## F3D146 5021 4560 4448 4467 3936 3848
## F3D147 17070 15637 15440 15513 14051 12823
## F3D148 12405 11413 11253 11211 10359 9763
## F3D149 13083 12017 11867 11909 11177 10661
## F3D150 5509 5032 4864 4930 4313 4231
## F3D2 19620 18075 17905 17916 17317 16725
## F3D3 6758 6250 6147 6180 5854 5489
## F3D5 4448 4052 3948 3983 3737 3737
## F3D6 7989 7369 7237 7295 6857 6670
## F3D7 5129 4765 4651 4687 4438 4220
## F3D8 5294 4871 4781 4776 4554 4525
## F3D9 7070 6504 6342 6442 6095 6018
if (length(fnFs) > 0) {
if (analysis_type == "bacteria") {
cat("Assigning bacterial taxonomy using",
config$taxonomy$bacteria$database, "\n")
# Check if database files exist
db_path <- config$taxonomy$bacteria$silva_train_set
if (file.exists(db_path)) {
taxa <- assignTaxonomy(
seqtab.nochim,
db_path,
multithread = TRUE
)
# Add species-level annotation if available
species_db <- config$taxonomy$bacteria$silva_species
if (file.exists(species_db)) {
taxa <- addSpecies(taxa, species_db)
}
} else {
cat("Warning: Database file not found at", db_path, "\n")
cat("Creating placeholder taxonomy table\n")
taxa <- matrix(
"Unassigned",
nrow = nrow(seqtab.nochim),
ncol = 7
)
colnames(taxa) <- c("Kingdom", "Phylum", "Class", "Order",
"Family", "Genus", "Species")
}
} else if (analysis_type == "fungi") {
cat("Assigning fungal taxonomy using",
config$taxonomy$fungi$database, "\n")
# Check if database files exist
db_path <- config$taxonomy$fungi$unite_train_set
if (file.exists(db_path)) {
taxa <- assignTaxonomy(
seqtab.nochim,
db_path,
multithread = TRUE
)
} else {
cat("Warning: Database file not found at", db_path, "\n")
cat("Creating placeholder taxonomy table\n")
taxa <- matrix(
"Unassigned",
nrow = nrow(seqtab.nochim),
ncol = 7
)
colnames(taxa) <- c("Kingdom", "Phylum", "Class", "Order",
"Family", "Genus", "Species")
}
}
# Inspect taxonomic assignments
cat("\nTaxonomic assignments (first few):\n")
print(head(taxa))
# Save taxonomy table
if (config$output$save_intermediate) {
saveRDS(taxa, file.path(output_path, "taxonomy.rds"))
}
}## Assigning bacterial taxonomy using silva
##
## Taxonomic assignments (first few):
## Kingdom
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGAAGATCAAGTCAGCGGTAAAATTGAGAGGCTCAACCTCTTCGAGCCGTTGAAACTGGTTTTCTTGAGTGAGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCATACCGGCGCTCAACTGACGCTCATGCACGAAAGTGTGGGTATCGAACAGG "Bacteria"
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGACTCTCAAGTCAGCGGTCAAATCGCGGGGCTCAACCCCGTTCCGCCGTTGAAACTGGGAGCCTTGAGTGCGCGAGAAGTAGGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCCTACCGGCGCGCAACTGACGCTCATGCACGAAAGCGTGGGTATCGAACAGG "Bacteria"
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGCTGTTAAGTCAGCGGTCAAATGTCGGGGCTCAACCCCGGCCTGCCGTTGAAACTGGCGGCCTCGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCATACCGGCGCCCGACTGACGCTGAGGCACGAAAGCGTGGGTATCGAACAGG "Bacteria"
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGCTTTTAAGTCAGCGGTAAAAATTCGGGGCTCAACCCCGTCCGGCCGTTGAAACTGGGGGCCTTGAGTGGGCGAGAAGAAGGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCCTTCCGGCGCCCTACTGACGCTGAGGCACGAAAGTGCGGGGATCGAACAGG "Bacteria"
## TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATTGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGAAACTGGCAGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGGTGTAGCGGTGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTGGACTGCAACTGACACTGATGCTCGAAAGTGTGGGTATCAAACAGG "Bacteria"
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCCTGCCAAGTCAGCGGTAAAATTGCGGGGCTCAACCCCGTACAGCCGTTGAAACTGCCGGGCTCGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCATACCGGCGCCCTACTGACGCTGAGGCACGAAAGTGCGGGGATCAAACAGG "Bacteria"
## Phylum
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGAAGATCAAGTCAGCGGTAAAATTGAGAGGCTCAACCTCTTCGAGCCGTTGAAACTGGTTTTCTTGAGTGAGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCATACCGGCGCTCAACTGACGCTCATGCACGAAAGTGTGGGTATCGAACAGG "Bacteroidota"
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGACTCTCAAGTCAGCGGTCAAATCGCGGGGCTCAACCCCGTTCCGCCGTTGAAACTGGGAGCCTTGAGTGCGCGAGAAGTAGGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCCTACCGGCGCGCAACTGACGCTCATGCACGAAAGCGTGGGTATCGAACAGG "Bacteroidota"
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGCTGTTAAGTCAGCGGTCAAATGTCGGGGCTCAACCCCGGCCTGCCGTTGAAACTGGCGGCCTCGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCATACCGGCGCCCGACTGACGCTGAGGCACGAAAGCGTGGGTATCGAACAGG "Bacteroidota"
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGCTTTTAAGTCAGCGGTAAAAATTCGGGGCTCAACCCCGTCCGGCCGTTGAAACTGGGGGCCTTGAGTGGGCGAGAAGAAGGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCCTTCCGGCGCCCTACTGACGCTGAGGCACGAAAGTGCGGGGATCGAACAGG "Bacteroidota"
## TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATTGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGAAACTGGCAGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGGTGTAGCGGTGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTGGACTGCAACTGACACTGATGCTCGAAAGTGTGGGTATCAAACAGG "Bacteroidota"
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCCTGCCAAGTCAGCGGTAAAATTGCGGGGCTCAACCCCGTACAGCCGTTGAAACTGCCGGGCTCGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCATACCGGCGCCCTACTGACGCTGAGGCACGAAAGTGCGGGGATCAAACAGG "Bacteroidota"
## Class
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGAAGATCAAGTCAGCGGTAAAATTGAGAGGCTCAACCTCTTCGAGCCGTTGAAACTGGTTTTCTTGAGTGAGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCATACCGGCGCTCAACTGACGCTCATGCACGAAAGTGTGGGTATCGAACAGG "Bacteroidia"
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGACTCTCAAGTCAGCGGTCAAATCGCGGGGCTCAACCCCGTTCCGCCGTTGAAACTGGGAGCCTTGAGTGCGCGAGAAGTAGGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCCTACCGGCGCGCAACTGACGCTCATGCACGAAAGCGTGGGTATCGAACAGG "Bacteroidia"
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGCTGTTAAGTCAGCGGTCAAATGTCGGGGCTCAACCCCGGCCTGCCGTTGAAACTGGCGGCCTCGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCATACCGGCGCCCGACTGACGCTGAGGCACGAAAGCGTGGGTATCGAACAGG "Bacteroidia"
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGCTTTTAAGTCAGCGGTAAAAATTCGGGGCTCAACCCCGTCCGGCCGTTGAAACTGGGGGCCTTGAGTGGGCGAGAAGAAGGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCCTTCCGGCGCCCTACTGACGCTGAGGCACGAAAGTGCGGGGATCGAACAGG "Bacteroidia"
## TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATTGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGAAACTGGCAGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGGTGTAGCGGTGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTGGACTGCAACTGACACTGATGCTCGAAAGTGTGGGTATCAAACAGG "Bacteroidia"
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCCTGCCAAGTCAGCGGTAAAATTGCGGGGCTCAACCCCGTACAGCCGTTGAAACTGCCGGGCTCGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCATACCGGCGCCCTACTGACGCTGAGGCACGAAAGTGCGGGGATCAAACAGG "Bacteroidia"
## Order
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGAAGATCAAGTCAGCGGTAAAATTGAGAGGCTCAACCTCTTCGAGCCGTTGAAACTGGTTTTCTTGAGTGAGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCATACCGGCGCTCAACTGACGCTCATGCACGAAAGTGTGGGTATCGAACAGG "Bacteroidales"
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGACTCTCAAGTCAGCGGTCAAATCGCGGGGCTCAACCCCGTTCCGCCGTTGAAACTGGGAGCCTTGAGTGCGCGAGAAGTAGGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCCTACCGGCGCGCAACTGACGCTCATGCACGAAAGCGTGGGTATCGAACAGG "Bacteroidales"
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGCTGTTAAGTCAGCGGTCAAATGTCGGGGCTCAACCCCGGCCTGCCGTTGAAACTGGCGGCCTCGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCATACCGGCGCCCGACTGACGCTGAGGCACGAAAGCGTGGGTATCGAACAGG "Bacteroidales"
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGCTTTTAAGTCAGCGGTAAAAATTCGGGGCTCAACCCCGTCCGGCCGTTGAAACTGGGGGCCTTGAGTGGGCGAGAAGAAGGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCCTTCCGGCGCCCTACTGACGCTGAGGCACGAAAGTGCGGGGATCGAACAGG "Bacteroidales"
## TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATTGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGAAACTGGCAGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGGTGTAGCGGTGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTGGACTGCAACTGACACTGATGCTCGAAAGTGTGGGTATCAAACAGG "Bacteroidales"
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCCTGCCAAGTCAGCGGTAAAATTGCGGGGCTCAACCCCGTACAGCCGTTGAAACTGCCGGGCTCGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCATACCGGCGCCCTACTGACGCTGAGGCACGAAAGTGCGGGGATCAAACAGG "Bacteroidales"
## Family
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGAAGATCAAGTCAGCGGTAAAATTGAGAGGCTCAACCTCTTCGAGCCGTTGAAACTGGTTTTCTTGAGTGAGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCATACCGGCGCTCAACTGACGCTCATGCACGAAAGTGTGGGTATCGAACAGG "Muribaculaceae"
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGACTCTCAAGTCAGCGGTCAAATCGCGGGGCTCAACCCCGTTCCGCCGTTGAAACTGGGAGCCTTGAGTGCGCGAGAAGTAGGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCCTACCGGCGCGCAACTGACGCTCATGCACGAAAGCGTGGGTATCGAACAGG "Muribaculaceae"
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGCTGTTAAGTCAGCGGTCAAATGTCGGGGCTCAACCCCGGCCTGCCGTTGAAACTGGCGGCCTCGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCATACCGGCGCCCGACTGACGCTGAGGCACGAAAGCGTGGGTATCGAACAGG "Muribaculaceae"
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGCTTTTAAGTCAGCGGTAAAAATTCGGGGCTCAACCCCGTCCGGCCGTTGAAACTGGGGGCCTTGAGTGGGCGAGAAGAAGGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCCTTCCGGCGCCCTACTGACGCTGAGGCACGAAAGTGCGGGGATCGAACAGG "Muribaculaceae"
## TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATTGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGAAACTGGCAGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGGTGTAGCGGTGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTGGACTGCAACTGACACTGATGCTCGAAAGTGTGGGTATCAAACAGG "Bacteroidaceae"
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCCTGCCAAGTCAGCGGTAAAATTGCGGGGCTCAACCCCGTACAGCCGTTGAAACTGCCGGGCTCGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCATACCGGCGCCCTACTGACGCTGAGGCACGAAAGTGCGGGGATCAAACAGG "Muribaculaceae"
## Genus
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGAAGATCAAGTCAGCGGTAAAATTGAGAGGCTCAACCTCTTCGAGCCGTTGAAACTGGTTTTCTTGAGTGAGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCATACCGGCGCTCAACTGACGCTCATGCACGAAAGTGTGGGTATCGAACAGG NA
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGACTCTCAAGTCAGCGGTCAAATCGCGGGGCTCAACCCCGTTCCGCCGTTGAAACTGGGAGCCTTGAGTGCGCGAGAAGTAGGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCCTACCGGCGCGCAACTGACGCTCATGCACGAAAGCGTGGGTATCGAACAGG NA
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGCTGTTAAGTCAGCGGTCAAATGTCGGGGCTCAACCCCGGCCTGCCGTTGAAACTGGCGGCCTCGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCATACCGGCGCCCGACTGACGCTGAGGCACGAAAGCGTGGGTATCGAACAGG NA
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGCTTTTAAGTCAGCGGTAAAAATTCGGGGCTCAACCCCGTCCGGCCGTTGAAACTGGGGGCCTTGAGTGGGCGAGAAGAAGGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCCTTCCGGCGCCCTACTGACGCTGAGGCACGAAAGTGCGGGGATCGAACAGG NA
## TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATTGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGAAACTGGCAGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGGTGTAGCGGTGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTGGACTGCAACTGACACTGATGCTCGAAAGTGTGGGTATCAAACAGG "Bacteroides"
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCCTGCCAAGTCAGCGGTAAAATTGCGGGGCTCAACCCCGTACAGCCGTTGAAACTGCCGGGCTCGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCATACCGGCGCCCTACTGACGCTGAGGCACGAAAGTGCGGGGATCAAACAGG NA
## Species
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGAAGATCAAGTCAGCGGTAAAATTGAGAGGCTCAACCTCTTCGAGCCGTTGAAACTGGTTTTCTTGAGTGAGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCATACCGGCGCTCAACTGACGCTCATGCACGAAAGTGTGGGTATCGAACAGG NA
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGACTCTCAAGTCAGCGGTCAAATCGCGGGGCTCAACCCCGTTCCGCCGTTGAAACTGGGAGCCTTGAGTGCGCGAGAAGTAGGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCCTACCGGCGCGCAACTGACGCTCATGCACGAAAGCGTGGGTATCGAACAGG NA
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGCTGTTAAGTCAGCGGTCAAATGTCGGGGCTCAACCCCGGCCTGCCGTTGAAACTGGCGGCCTCGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCATACCGGCGCCCGACTGACGCTGAGGCACGAAAGCGTGGGTATCGAACAGG NA
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGGCTTTTAAGTCAGCGGTAAAAATTCGGGGCTCAACCCCGTCCGGCCGTTGAAACTGGGGGCCTTGAGTGGGCGAGAAGAAGGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCCTTCCGGCGCCCTACTGACGCTGAGGCACGAAAGTGCGGGGATCGAACAGG NA
## TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATTGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGAAACTGGCAGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGGTGTAGCGGTGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTGGACTGCAACTGACACTGATGCTCGAAAGTGTGGGTATCAAACAGG NA
## TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCCTGCCAAGTCAGCGGTAAAATTGCGGGGCTCAACCCCGTACAGCCGTTGAAACTGCCGGGCTCGAGTGGGCGAGAAGTATGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACCCCGATTGCGAAGGCAGCATACCGGCGCCCTACTGACGCTGAGGCACGAAAGTGCGGGGATCAAACAGG NA
if (length(fnFs) > 0) {
# Read sample metadata if it exists
metadata_file <- config$metadata$file
if (file.exists(metadata_file)) {
samdf <- read.csv(metadata_file, row.names = 1)
cat("Loaded metadata for", nrow(samdf), "samples\n")
} else {
cat("Warning: Metadata file not found. Creating minimal metadata.\n")
samdf <- data.frame(
SampleID = sample.names,
row.names = sample.names
)
}
# Create phyloseq object
ps <- phyloseq(
otu_table(seqtab.nochim, taxa_are_rows = FALSE),
sample_data(samdf),
tax_table(taxa)
)
cat("\nPhyloseq object summary:\n")
print(ps)
# Save phyloseq object
if (config$output$save_intermediate) {
saveRDS(ps, file.path(output_path, "phyloseq_object.rds"))
}
}## Loaded metadata for 19 samples
##
## Phyloseq object summary:
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 217 taxa and 19 samples ]
## sample_data() Sample Data: [ 19 samples by 3 sample variables ]
## tax_table() Taxonomy Table: [ 217 taxa by 7 taxonomic ranks ]
if (length(fnFs) > 0) {
# Remove samples with fewer than minimum reads
min_reads <- config$phyloseq$min_reads
ps_filtered <- prune_samples(sample_sums(ps) >= min_reads, ps)
cat("Samples before filtering:", nsamples(ps), "\n")
cat("Samples after filtering (>= ", min_reads, " reads):",
nsamples(ps_filtered), "\n")
# Remove low prevalence taxa
prev_threshold <- config$phyloseq$prevalence_threshold
prevalence <- apply(otu_table(ps_filtered), 2, function(x) sum(x > 0) / length(x))
ps_filtered <- prune_taxa(prevalence >= prev_threshold, ps_filtered)
cat("Taxa after prevalence filtering (>=", prev_threshold, "):",
ntaxa(ps_filtered), "\n")
# Update phyloseq object
ps <- ps_filtered
# Save filtered phyloseq object
if (config$output$save_intermediate) {
saveRDS(ps, file.path(output_path, "phyloseq_filtered.rds"))
}
}## Samples before filtering: 19
## Samples after filtering (>= 1000 reads): 19
## Taxa after prevalence filtering (>= 0.05 ): 217
if (length(fnFs) > 0 && nsamples(ps) > 0) {
# Plot alpha diversity
p <- plot_richness(ps, measures = c("Shannon", "Simpson", "Chao1")) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Alpha Diversity Measures")
print(p)
ggsave(file.path(fig_path, "alpha_diversity.png"),
width = 10, height = 6, dpi = 300)
}if (length(fnFs) > 0 && nsamples(ps) > 0) {
# Plot taxonomic composition at Phylum level
ps_phylum <- tax_glom(ps, "Phylum", NArm = TRUE)
p <- plot_bar(ps_phylum, fill = "Phylum") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Taxonomic Composition at Phylum Level",
x = "Sample",
y = "Abundance")
print(p)
ggsave(file.path(fig_path, "taxonomic_composition_phylum.png"),
width = 12, height = 8, dpi = 300)
}if (length(fnFs) > 0 && nsamples(ps) > 1) {
# Transform to relative abundance
ps_rel <- transform_sample_counts(ps, function(x) x / sum(x))
# Perform ordination (PCoA with Bray-Curtis)
ord <- ordinate(ps_rel, method = "PCoA", distance = "bray")
p <- plot_ordination(ps_rel, ord, type = "samples") +
theme_bw() +
labs(title = "PCoA Ordination (Bray-Curtis Distance)")
print(p)
ggsave(file.path(fig_path, "ordination_pcoa.png"),
width = 10, height = 8, dpi = 300)
}## R version 4.4.2 (2024-10-31)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sequoia 15.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices datasets utils methods base
##
## other attached packages:
## [1] tidyr_1.3.1 dplyr_1.1.4 yaml_2.3.10 ggplot2_4.0.1
## [5] phyloseq_1.50.0 dada2_1.34.0 Rcpp_1.1.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-9 deldir_2.0-4
## [3] permute_0.9-8 rlang_1.1.6
## [5] magrittr_2.0.4 ade4_1.7-23
## [7] matrixStats_1.5.0 compiler_4.4.2
## [9] mgcv_1.9-1 systemfonts_1.3.1
## [11] png_0.1-8 vctrs_0.6.5
## [13] reshape2_1.4.5 stringr_1.6.0
## [15] pwalign_1.2.0 pkgconfig_2.0.3
## [17] crayon_1.5.3 fastmap_1.2.0
## [19] XVector_0.46.0 labeling_0.4.3
## [21] Rsamtools_2.22.0 rmarkdown_2.30
## [23] UCSC.utils_1.2.0 ragg_1.5.0
## [25] purrr_1.2.0 xfun_0.54
## [27] zlibbioc_1.52.0 cachem_1.1.0
## [29] GenomeInfoDb_1.42.3 jsonlite_2.0.0
## [31] biomformat_1.34.0 rhdf5filters_1.18.1
## [33] DelayedArray_0.32.0 Rhdf5lib_1.28.0
## [35] BiocParallel_1.40.2 jpeg_0.1-11
## [37] parallel_4.4.2 cluster_2.1.6
## [39] R6_2.6.1 bslib_0.9.0
## [41] stringi_1.8.7 RColorBrewer_1.1-3
## [43] GenomicRanges_1.58.0 jquerylib_0.1.4
## [45] SummarizedExperiment_1.36.0 iterators_1.0.14
## [47] knitr_1.50 IRanges_2.40.1
## [49] Matrix_1.7-1 splines_4.4.2
## [51] igraph_2.2.1 tidyselect_1.2.1
## [53] rstudioapi_0.17.1 abind_1.4-8
## [55] vegan_2.7-2 codetools_0.2-20
## [57] hwriter_1.3.2.1 lattice_0.22-6
## [59] tibble_3.3.0 plyr_1.8.9
## [61] Biobase_2.66.0 withr_3.0.2
## [63] ShortRead_1.64.0 S7_0.2.1
## [65] evaluate_1.0.5 survival_3.7-0
## [67] RcppParallel_5.1.11-1 Biostrings_2.74.1
## [69] pillar_1.11.1 BiocManager_1.30.27
## [71] MatrixGenerics_1.18.1 renv_1.1.5
## [73] foreach_1.5.2 stats4_4.4.2
## [75] generics_0.1.4 S4Vectors_0.44.0
## [77] scales_1.4.0 glue_1.8.0
## [79] tools_4.4.2 interp_1.1-6
## [81] data.table_1.17.8 GenomicAlignments_1.42.0
## [83] rhdf5_2.50.2 grid_4.4.2
## [85] ape_5.8-1 latticeExtra_0.6-31
## [87] nlme_3.1-166 GenomeInfoDbData_1.2.13
## [89] cli_3.6.5 textshaping_1.0.4
## [91] S4Arrays_1.6.0 gtable_0.3.6
## [93] sass_0.4.10 digest_0.6.39
## [95] BiocGenerics_0.52.0 SparseArray_1.6.2
## [97] farver_2.1.2 htmltools_0.5.8.1
## [99] multtest_2.62.0 lifecycle_1.0.4
## [101] httr_1.4.7 MASS_7.3-61
This pipeline has completed the following steps:
All output files have been saved to the output/
directory.